getwd()
## [1] "/Users/meabhhartney/Desktop/DATA6530"
Sys.time()
## [1] "2023-03-06 22:30:35 EST"
alcoholsales <- readxl::read_excel("data/AlcoholSales.xlsx")
alcoholsales
alc <- alcoholsales %>%
mutate(Quarter = yearquarter(Date)) %>%
as_tsibble(index = Quarter)
alc
alc %>%
filter(year(Quarter) <= 2016) -> alc_train
alc_train
alc %>%
filter(year(Quarter) > 2016) -> alc_test
alc_test
alc %>%
autoplot(Sales)
alc %>%
autoplot(GDP)
alc %>%
autoplot(CPI)
alc %>%
autoplot(Sales) + labs(y = "Sales") +
geom_ribbon(aes(xmin = 18200, xmax = 18900), fill = "pink", alpha = 0.4) +
annotate("text", x = 18500, y = 2000, label = "Covid-19 Pandemic", col = "red", size = 3)
#geom_ribbon(aes(xmin = 1979.98, xmax = 1989.13), fill = "pink", alpha = 0.4) +
#annotate("text", x = 1984.5, y = 10, label = "Soviet-Afghan War", col = "red", size = 3)
alc %>%
autoplot(GDP) + labs(y = "GDP") +
geom_ribbon(aes(xmin = 18200, xmax = 18900), fill = "pink", alpha = 0.4) +
annotate("text", x = 18500, y = 2000, label = "Covid-19 Pandemic", col = "red", size = 3)
alc %>%
features(Sales, guerrero)
As lamba is very close to 0, a log transformation of the data would be best
alc %>%
autoplot(log(Sales))
alc_fit <- alc_train %>%
model(
Mean = MEAN(log(Sales)),
`Naïve` = NAIVE(log(Sales)),
`Seasonal naïve` = SNAIVE(log(Sales)),
Drift = RW(log(Sales) ~ drift())
)
alc_fc <- alc_fit %>%
forecast(alc_test)
alc_fc %>%
autoplot(alc,
level = NULL
) +
labs(
y = "$ (millions)",
title = "Forecasts for quarterly sales"
) +
guides(colour = guide_legend(title = "Forecast"))
accuracy(alc_fit)
accuracy(alc_fc, alc)
- Time Series Regression Models using the train data (1992-2016)
reg_model0 <- alc_train %>%
model(TSLM(log(Sales) ~ trend() + season() + GDP + CPI))
report(reg_model0)
## Series: Sales
## Model: TSLM
## Transformation: log(Sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0544804 -0.0118517 -0.0007681 0.0138791 0.0580104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.317e+00 1.210e-01 52.224 < 2e-16 ***
## trend() -1.590e-03 1.284e-03 -1.238 0.219
## season()year2 1.032e-01 6.210e-03 16.623 < 2e-16 ***
## season()year3 1.279e-01 6.238e-03 20.500 < 2e-16 ***
## season()year4 2.318e-01 6.824e-03 33.969 < 2e-16 ***
## GDP 1.757e-07 2.823e-08 6.224 1.38e-08 ***
## CPI 5.192e-03 8.087e-04 6.420 5.67e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02075 on 93 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9952
## F-statistic: 3409 on 6 and 93 DF, p-value: < 2.22e-16
reg_model1 <- alc_train %>%
model(TSLM(log(Sales) ~ season() + GDP+ CPI))
report(reg_model1)
## Series: Sales
## Model: TSLM
## Transformation: log(Sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0557126 -0.0122025 -0.0005554 0.0143956 0.0550294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.451e+00 5.428e-02 118.856 < 2e-16 ***
## season()year2 1.050e-01 6.065e-03 17.308 < 2e-16 ***
## season()year3 1.294e-01 6.138e-03 21.076 < 2e-16 ***
## season()year4 2.344e-01 6.521e-03 35.938 < 2e-16 ***
## GDP 1.502e-07 1.935e-08 7.763 1.00e-11 ***
## CPI 4.485e-03 5.743e-04 7.810 7.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02081 on 94 degrees of freedom
## Multiple R-squared: 0.9954, Adjusted R-squared: 0.9952
## F-statistic: 4067 on 5 and 94 DF, p-value: < 2.22e-16
fc0 <- forecast(reg_model0,alc_test)
fc0 %>%
autoplot(alc)
accuracy(reg_model0)
accuracy(fc0, alc)
fc1 <- forecast(reg_model1,alc_test)
fc1 %>%
autoplot(alc)
accuracy(reg_model1)
accuracy(fc1, alc)
alc_train %>%
autoplot(Sales, col = "blue") +
geom_line(data = augment(reg_model0), aes(y = .fitted), col = "red") + labs(title = "Regression Model using trend, season, GDP and CPI (reg_model0)")
alc_train %>%
autoplot(Sales, col = "blue") +
geom_line(data = augment(reg_model1), aes(y = .fitted), col = "red") + labs(title = "Regression Model using season, GDP and CPI (reg_model1)")
alc %>%
autoplot(log(Sales))
alc %>%
features(Sales,unitroot_kpss)
Small P value, differencing is required
alc %>%
features(Sales, unitroot_ndiffs)
alc %>%
features(Sales, unitroot_nsdiffs)
alc_train %>%
autoplot(log(Sales) %>% difference())
alc %>%
gg_tsdisplay(difference(log(Sales),4), plot_type='partial', lag = 36) +
labs(title = "Quarterly sales with differencing", y="")
ARIMA models
fit1 <- alc_train %>%
model(
arima011210 = ARIMA(Sales ~ pdq(0,1,1) + PDQ(2,1,0)),
arima012110 = ARIMA(Sales ~ pdq(0,1,2) + PDQ(1,1,0)),
auto = ARIMA(Sales, stepwise = FALSE, approx = FALSE)
)
fit1 %>% pivot_longer(everything(), names_to = "Model name",
values_to = "Orders")
glance(fit1) %>% arrange(AICc) %>% select(.model:BIC)
forecast(fit1, alc_test) %>%
autoplot(alc) +
labs(title = "Forecasting quarterly sales with ARIMA models",
y="$ million")
forecast(fit1, alc_test) %>%
filter(.model=='auto') %>%
autoplot(alc) +
labs(title = "Forecasting quarterly sales with auto ARIMA model",
y="$ million")
ETS model
fit_ets <-alc %>%
model(
SES = ETS(log(Sales) ~ error("A") + trend("N") + season("N")),
Holt = ETS(log(Sales) ~ error("A") + trend("A") + season("N")),
Damped = ETS(log(Sales) ~ error("A") + trend("Ad") + season("N")),
auto = ETS(log(Sales))
)
accuracy(fit_ets)
forecast(fit_ets, alc_test) %>%
autoplot(alc)
forecast(fit_ets, alc_test) %>%
filter(.model=='auto') %>%
autoplot(alc)
fit_ets %>%
forecast(h = 10) %>%
autoplot(alc)
fit_ets %>%
forecast(h = 10) %>%
filter(.model=='auto') %>%
autoplot(alc)
fitall <- alc_train %>%
model(
"Holt" = ETS(log(Sales) ~ error("A") + trend("A") + season("N")),
"ARIMA" = ARIMA(log(Sales)),
"naïve" = NAIVE(log(Sales)),
"Regression" = TSLM(log(Sales) ~ season() + GDP + CPI)
)
fcall <- forecast(fitall, alc_test)
accuracy(fitall)
accuracy(fcall, alc)
alc %>%
autoplot(Sales) +
autolayer(fcall) +
labs(x = "Quarter",
title = "Forecasting Model Comparison",
y = "$ million"
)
Regression model is best
fitall %>% select("Regression") %>% report()
## Series: Sales
## Model: TSLM
## Transformation: log(Sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0557126 -0.0122025 -0.0005554 0.0143956 0.0550294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.451e+00 5.428e-02 118.856 < 2e-16 ***
## season()year2 1.050e-01 6.065e-03 17.308 < 2e-16 ***
## season()year3 1.294e-01 6.138e-03 21.076 < 2e-16 ***
## season()year4 2.344e-01 6.521e-03 35.938 < 2e-16 ***
## GDP 1.502e-07 1.935e-08 7.763 1.00e-11 ***
## CPI 4.485e-03 5.743e-04 7.810 7.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02081 on 94 degrees of freedom
## Multiple R-squared: 0.9954, Adjusted R-squared: 0.9952
## F-statistic: 4067 on 5 and 94 DF, p-value: < 2.22e-16